Load data: 224 subjects with 791 goals are included in the following analysis.
# Load the goal ratings (long format), the individual-difference measures,
# and the per-subject goal summaries (wide format).
# Use TRUE/FALSE rather than T/F: T and F are ordinary variables that can be
# reassigned, so relying on them is unsafe.
goalRating_long_R <- read.csv("./inputs/goalRating_long_R.csv", stringsAsFactors = FALSE)
indivDiffDf <- read.csv("./inputs/indivDiffDf.csv", stringsAsFactors = FALSE)
goalDf_sum_wide <- read.csv("./inputs/goalDf_wide.csv", stringsAsFactors = FALSE)
Check the number of missing data points per variable; the top 5 variables are shown below. Missing data is rare for all variables.
# Count missing (NA) ratings per variable. Each goal was rated on 35 items,
# so the total number of goals is nrow / 35.
# NOTE(review): the original comment said "I'm not sure" responses, but this
# chunk counts NAs; the 99-coded "I'm not sure" responses are counted below.
totalGoal <- nrow(goalRating_long_R) / 35
goalRating_long_R %>%
  filter(is.na(rating)) %>%
  tabyl(variable) %>%
  mutate(percent = n / totalGoal) %>%
  arrange(desc(percent)) %>%
  head(5)
## variable n percent
## commitment 6 0.006807352
## importance 5 0.005672793
## instrumentality 5 0.005672793
## regret 5 0.005672793
## conflict 4 0.004538235
“construal_level”,“approach_avoidance” and “attainment_maintenance” question have an option for “I’m not sure” because they ask subjects to categorize their goals.
Around 2% of the goals had “I’m not sure” as the response.
# Count "I'm not sure" responses (coded 99) per variable,
# as a proportion of all goals.
goalRating_long_R %>%
  filter(rating == 99) %>%
  tabyl(variable) %>%
  mutate(percent = n / totalGoal) %>%
  arrange(desc(percent))
## variable n percent
## construal_level 23 0.02609485
## attainment_maintenance_R 16 0.01815294
## approach_avoidance_R 15 0.01701838
temporal_duration, frequency and end_state_specificity question have an option for “not specified” because they ask about features that may not be applicable to all goals.
The end state specificity is not applicable to around 9% of the goals
# Count "not specified" responses (coded 999) per variable,
# as a proportion of all goals.
goalRating_long_R %>%
  filter(rating == 999) %>%
  tabyl(variable) %>%
  mutate(percent = n / totalGoal) %>%
  arrange(desc(percent))
## variable n percent
## end_state_specificity_R 82 0.09303381
## temporal_duration 43 0.04878602
## frequency_R 23 0.02609485
All “I’m not sure” and “not specified” responses will be treated as missing data.
# Recode the special response codes 99 ("I'm not sure") and 999 ("not
# specified") as missing. `%in%` is the idiomatic membership test and
# replaces the verbose `rating == 99 | rating == 999`.
goalRating_long_R <- goalRating_long_R %>%
  mutate(rating = replace(rating, rating %in% c(99, 999), NA))
Descriptive statistics on the number of goals subjects claimed to have prior to listing them (in the SONA study, the median number of claimed goals is 4).
# Descriptive statistics (psych::describe) for the number of goals subjects
# claimed to have before listing them
describe(goalDf_sum_wide$total_goal)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 224 4.38 13.69 3 3.03 1.48 1 200 199 13.22 183.97 0.91
Visualize the number of claimed goals per subject after excluding the extreme values (> 50): one subject claimed 50 goals and one claimed 200.
# Histogram of the number of claimed goals per subject,
# excluding the extreme cases (>= 50 claimed goals).
breaks <- 1:20
goalDf_sum_wide %>%
  filter(total_goal < 50) %>%
  ggplot(aes(x = total_goal)) +
  scale_x_continuous(labels = scales::comma(breaks, accuracy = 1),
                     breaks = breaks) +
  geom_histogram(fill = "orange", colour = "black", binwidth = 1) +
  labs(x = "Number of claimed goals", y = "# of participants") +
  theme_classic(base_size = 18)
The percentage of subjects who claimed having more than 5 goals: 6.25%
# Total number of subjects (one row per subject in the individual-difference data)
totalSub <- nrow(indivDiffDf)
# Proportion of subjects who claimed more than 5 goals.
# sum() over a logical vector is the idiomatic count, and na.rm = TRUE is
# robust to missing values (length() on a logical subset would count NAs).
sum(goalDf_sum_wide$total_goal > 5, na.rm = TRUE) / totalSub
## [1] 0.0625
Descriptive statistics on the number of goals participants actually listed (in the SONA study, the mean is 4.13).
# Descriptive statistics (psych::describe) for the number of goals
# participants actually listed (1-5)
describe(goalDf_sum_wide$listNum)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 224 3.53 1.37 3 3.62 1.48 1 5 4 -0.26 -1.35 0.09
# Histogram of the number of goals actually listed (range 1-5)
breaks <- 1:5
goalDf_sum_wide %>%
  ggplot(aes(x = listNum)) +
  scale_x_continuous(labels = scales::comma(breaks, accuracy = 1),
                     breaks = breaks) +
  geom_histogram(fill = "orange", colour = "black", binwidth = 1) +
  labs(x = "Number of listed goals", y = "# of participants") +
  theme_classic(base_size = 18)
number of people who listed 1 goal: 15 (SONA study: 1)
# Count subjects who listed exactly one goal. sum() over a logical vector is
# the idiomatic count and, with na.rm = TRUE, is robust to missing values
# (length() on a subset would count NA positions as matches).
sum(goalDf_sum_wide$listNum == 1, na.rm = TRUE)
## [1] 15
Descriptive statistics on the differences between the number of claimed goals and listed goals (after excluding the 2 extreme cases).
# Difference between claimed and listed goals;
# positive means the subject claimed more goals than they listed.
goalDf_sum_wide <- goalDf_sum_wide %>%
  mutate(diffNum = total_goal - listNum)
# Drop the two extreme cases (>= 50 claimed goals) before describing
goalDf_sum_wide_clean <- goalDf_sum_wide %>%
  filter(total_goal < 50)
describe(goalDf_sum_wide_clean$diffNum)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 222 -0.23 2 0 -0.25 0 -4 15 19 4.35 31.11 0.13
# Histogram of the claimed-minus-listed difference (excluding extreme cases)
breaks <- -4:15
goalDf_sum_wide_clean %>%
  ggplot(aes(x = diffNum)) +
  scale_x_continuous(labels = scales::comma(breaks, accuracy = 1),
                     breaks = breaks) +
  geom_histogram(fill = "orange", colour = "black", binwidth = 1) +
  labs(x = "Number of claimed goals - listed goals", y = "# of participants") +
  theme_classic(base_size = 18)
percentage of people who listed more goals than they claimed: 22.3%
# Percentage of subjects who listed MORE goals than they claimed (diffNum < 0).
# sum() over a logical vector is the idiomatic count; na.rm guards against NAs.
sum(goalDf_sum_wide$diffNum < 0, na.rm = TRUE) / totalSub * 100
## [1] 22.32143
Percentage of people who listed fewer goals than they claimed: 7.5%.
# Percentage of subjects who listed FEWER goals than they claimed (diffNum > 0).
# sum() over a logical vector is the idiomatic count; na.rm guards against NAs.
sum(goalDf_sum_wide$diffNum > 0, na.rm = TRUE) / totalSub * 100
## [1] 7.589286
Compared to the SONA study, more people listed more goals than they claimed, which may indicate a priming effect of the goal listing task.
# Descriptive statistics (mean, sd, n, range, skew, kurtosis) per rating
# variable, sorted by skewness and rendered as a styled HTML table.
goalRating_long_R %>%
  dplyr::select(variable, rating) %>%
  group_by(variable) %>%
  summarize(
    mean    = mean(rating, na.rm = TRUE),
    sd      = sd(rating, na.rm = TRUE),
    n       = n(),
    min     = min(rating, na.rm = TRUE),
    max     = max(rating, na.rm = TRUE),
    skew    = skew(rating, na.rm = TRUE),
    kurtosi = kurtosi(rating, na.rm = TRUE)
  ) %>%
  arrange(skew) %>%
  kable(format = "html", escape = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE, position = "center")
## `summarise()` ungrouping output (override with `.groups` argument)
| variable | mean | sd | n | min | max | skew | kurtosi |
|---|---|---|---|---|---|---|---|
| specificity | 5.703046 | 1.4490556 | 791 | 1 | 7 | -1.3968834 | 1.7409083 |
| ideal_motivation | 5.762025 | 1.3421417 | 791 | 1 | 7 | -1.3019606 | 1.7582688 |
| clarity | 5.797468 | 1.1922835 | 791 | 1 | 7 | -1.2591632 | 1.9628696 |
| identified_motivation | 6.029077 | 1.0805128 | 791 | 1 | 7 | -1.2361306 | 1.7466351 |
| control | 5.951899 | 1.1851276 | 791 | 1 | 7 | -1.2304266 | 1.5682491 |
| importance | 6.019084 | 1.1268092 | 791 | 1 | 7 | -1.2272333 | 1.6847051 |
| initial_time_R | 6.517067 | 1.5925140 | 791 | 1 | 8 | -1.2246051 | 1.1945486 |
| basic_needs | 5.433460 | 1.6017743 | 791 | 1 | 7 | -1.1377953 | 0.7998718 |
| attractiveness_achievement | 5.937975 | 1.0536681 | 791 | 1 | 7 | -1.1165131 | 1.5995647 |
| social_desirability | 5.936709 | 1.1069749 | 791 | 1 | 7 | -1.0229599 | 0.9731957 |
| commitment | 5.938853 | 1.1501248 | 791 | 1 | 7 | -1.0061179 | 0.6538724 |
| commonality | 5.406329 | 1.5405661 | 791 | 1 | 7 | -0.9758224 | 0.4364142 |
| attainability | 8.474619 | 2.0460059 | 791 | 1 | 11 | -0.9712587 | 0.8349664 |
| instrumentality | 5.367685 | 1.5585144 | 791 | 1 | 7 | -0.9293689 | 0.2879043 |
| attractiveness_progress | 5.655260 | 1.1130460 | 791 | 1 | 7 | -0.9026328 | 1.0765550 |
| regret | 5.368957 | 1.5700227 | 791 | 1 | 7 | -0.8978937 | 0.1990149 |
| measurability | 5.624842 | 1.4200134 | 791 | 1 | 7 | -0.8905870 | 0.1867125 |
| meaningfulness | 5.269377 | 1.5353726 | 791 | 1 | 7 | -0.7894562 | 0.1375735 |
| temporal_duration | 3.076305 | 0.9396327 | 791 | 1 | 4 | -0.7133993 | -0.4740105 |
| construal_level | 5.002614 | 1.8016843 | 791 | 1 | 7 | -0.6717568 | -0.5209142 |
| visibility | 4.977128 | 1.7797233 | 791 | 1 | 7 | -0.6661160 | -0.4917870 |
| approach_avoidance_R | 5.047742 | 2.3403089 | 791 | 1 | 7 | -0.6499189 | -1.2634578 |
| difficulty | 5.363291 | 1.3286048 | 791 | 1 | 7 | -0.6070920 | -0.0754915 |
| affordance | 5.150824 | 1.4404649 | 791 | 1 | 7 | -0.6002490 | -0.1305322 |
| external_importance | 4.657795 | 1.8875778 | 791 | 1 | 7 | -0.5408762 | -0.7979445 |
| effort | 5.080051 | 1.5014653 | 791 | 1 | 7 | -0.5213915 | -0.4184612 |
| urgency | 4.989848 | 1.5097812 | 791 | 1 | 7 | -0.4850394 | -0.3154366 |
| introjected_motivation | 4.348101 | 2.0268390 | 791 | 1 | 7 | -0.4139940 | -1.1128018 |
| intrinsic_motivation | 4.465057 | 2.0195574 | 791 | 1 | 7 | -0.4041309 | -1.1038115 |
| connectedness | 4.517154 | 1.8848932 | 791 | 1 | 7 | -0.3863797 | -0.9558996 |
| external_motivation | 4.130545 | 2.0977247 | 791 | 1 | 7 | -0.2854198 | -1.3223306 |
| advancement | 6.314322 | 2.9241790 | 791 | 1 | 11 | -0.1142650 | -1.1289960 |
| ought_motivation | 3.898734 | 2.1537537 | 791 | 1 | 7 | -0.0842966 | -1.4430863 |
| procrastination | 4.048223 | 1.8357482 | 791 | 1 | 7 | -0.0820074 | -1.1796906 |
| attainment_maintenance_R | 3.994832 | 2.4741822 | 791 | 1 | 7 | 0.0632405 | -1.6729519 |
| frequency_R | 1.474576 | 0.4996791 | 791 | 1 | 2 | 0.1016276 | -1.9922642 |
| end_state_specificity_R | 1.931915 | 0.8940554 | 791 | 1 | 3 | 0.1333064 | -1.7376529 |
| conflict | 3.510801 | 2.0373916 | 791 | 1 | 7 | 0.1759990 | -1.3270935 |
| failure | 1.710127 | 0.8651974 | 791 | 1 | 3 | 0.5938757 | -1.4052165 |
# order based on their skewness
#kable(varDf[order(varDf$skew),])
The trends shown in these histograms are very similar to the SONA study.
# Histogram of the rating distribution for each variable, faceted per variable
goalRating_long_R %>%
  ggplot(aes(x = rating)) +
  geom_histogram(fill = "orange", colour = "black", alpha = 0.6) +
  facet_wrap(~variable, nrow = 7)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
“pairwise.complete.obs” is used for generating the correlational matrix. The correlations make sense.
# Reshape long to wide: one column per rating variable.
# NOTE(review): spread() sorts the new columns alphabetically; the
# advancement:visibility range selections below rely on that ordering,
# so spread() is kept rather than replaced with pivot_wider().
goalDf_wide <- goalRating_long_R %>% spread(variable, rating)
# Correlation matrix across all rating variables (pairwise-complete cases)
corrM_all <- goalDf_wide %>%
  dplyr::select(advancement:visibility) %>%
  cor(use = "pairwise.complete.obs")
# Visualize the upper triangle with coefficients overlaid
corrplot(corrM_all,
         method = "circle", number.cex = 0.7, order = "AOE",
         addCoef.col = "black", type = "upper",
         col = colorRampPalette(c("midnightblue", "white", "orange"))(200))
### Variance Partition
Only the 31 variables for goal representation are included. Only around 8.6% of the variance is on the between subject level.
# Keep only the 31 goal-representation variables by dropping the
# 8 goal-striving items.
goal_striving <- c("commitment", "urgency", "effort", "advancement",
                   "initial_time_R", "regret", "procrastination", "failure")
goalDf_R_long <- goalRating_long_R[!goalRating_long_R$variable %in% goal_striving, ]
# Multilevel model with a random intercept per subject (MTurkCode)
mlm <- lmer(rating ~ variable + (1 | MTurkCode), data = goalDf_R_long)
# Variance partition coefficient (ICC): share of total variance that lies
# at the between-subject level
VarCorr(mlm) %>%
  as_tibble() %>%
  mutate(icc = vcov / sum(vcov)) %>%
  dplyr::select(grp, icc)
## # A tibble: 2 x 2
## grp icc
## <chr> <dbl>
## 1 MTurkCode 0.0862
## 2 Residual 0.914
# Store the raw variance proportions from the same model for later reference
Raw <- VarCorr(mlm) %>%
  as_tibble() %>%
  mutate(Raw = vcov / sum(vcov)) %>%
  dplyr::select(Raw)
26 variables are included. Ordinal variables are not included: “temporal_duration”, “end_state_specificity” and “frequency”; approach_avoidance_R and attainment_maintenance_R are also dropped because these 2 variables are more relevant to the phrasing/content of a goal than the perception of a goal. This step is consistent with the SONA study.
# Exclude the 8 goal-striving variables from the wide data
goalDf_R_wide <- goalDf_wide[, !names(goalDf_wide) %in% goal_striving]
# Exclude the 3 ordinal variables and the 2 phrasing-related variables,
# then keep only the rating columns (consistent with the SONA study)
goal_exclude <- c("temporal_duration", "end_state_specificity_R", "frequency_R",
                  "attainment_maintenance_R", "approach_avoidance_R")
goalDf_EFA <- goalDf_R_wide[, !names(goalDf_R_wide) %in% goal_exclude]
goalDf_EFA <- subset(goalDf_EFA, select = affordance:visibility)
# Correlation matrix for the EFA (pairwise-complete cases)
corrM_raw <- cor(goalDf_EFA, use = "pairwise")
# Very Simple Structure criterion for choosing the number of factors
res_vss <- psych::nfactors(corrM_raw, n = 10, rotate = "promax", diagonal = FALSE,
                           fm = "minres", n.obs = 791,
                           title = "Very Simple Structure",
                           use = "pairwise", cor = "cor")
# Tabulate the MAP and VSS fit statistics per candidate number of factors.
# Fix: build the data.frame directly instead of cbind() + as.tibble();
# as.tibble() is deprecated and warned about non-unique matrix column names
# (see the tibble 2.0.0 deprecation warnings previously emitted here).
data.frame(factor = 1:10, map = res_vss$map) %>%
  cbind(res_vss$vss.stats) %>%
  select(factor, map, fit, complex, eChisq, SRMR, eCRMS, eBIC, eRMS) %>%
  kable(format = "html", escape = FALSE,
        caption = "VSS output after dropping 2 variables") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE, position = "center", fixed_thead = TRUE)
## Warning: `as.tibble()` is deprecated as of tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
| factor | map | fit | complex | eChisq | SRMR | eCRMS | eBIC | eRMS |
|---|---|---|---|---|---|---|---|---|
| 1 | 0.0301970 | 0.5610515 | 1.000000 | 8679.43583 | 0.1299274 | 0.1354587 | 6684.1197 | 0.1299274 |
| 2 | 0.0146179 | 0.6580092 | 1.251094 | 2573.42431 | 0.0707474 | 0.0770508 | 744.9407 | 0.0707474 |
| 3 | 0.0132862 | 0.7149469 | 1.414591 | 1311.03084 | 0.0504965 | 0.0575749 | -357.2937 | 0.0504965 |
| 4 | 0.0125204 | 0.7277976 | 1.574012 | 677.33499 | 0.0362958 | 0.0434296 | -837.5036 | 0.0362958 |
| 5 | 0.0139541 | 0.7352400 | 1.731892 | 449.17540 | 0.0295572 | 0.0372158 | -918.8507 | 0.0295572 |
| 6 | 0.0162217 | 0.7285363 | 1.891997 | 325.64092 | 0.0251666 | 0.0334470 | -902.2459 | 0.0251666 |
| 7 | 0.0186254 | 0.7108767 | 1.969130 | 224.28020 | 0.0208858 | 0.0294016 | -870.1407 | 0.0208858 |
| 8 | 0.0222770 | 0.6836729 | 2.094972 | 162.64913 | 0.0177861 | 0.0266280 | -804.9791 | 0.0177861 |
| 9 | 0.0252166 | 0.6749553 | 2.050097 | 117.82758 | 0.0151384 | 0.0242169 | -729.6813 | 0.0151384 |
| 10 | 0.0296479 | 0.6597003 | 1.803383 | 89.65666 | 0.0132052 | 0.0226982 | -644.4061 | 0.0132052 |
# Scree plot: number of factors with eigenvalues > 1, plus the
# parallel-analysis criterion (nFactors package)
ev <- eigen(corrM_raw)
ap <- parallel(subject = nrow(goalDf_EFA), var = ncol(goalDf_EFA),
               rep = 100, cent = 0.05)
nS <- nScree(x = ev$values, aparallel = ap$eigen$qevpea)
plotnScree(nS)
### Extract factors
Extract number of factors based on the suggestions above. Because we expect factors to be correlated with each other, we use “promax” rotation.
# Extract 4-, 5-, and 6-factor solutions with promax (oblique) rotation.
# Bug fixes: fa_raw_4 previously used nfactors = 5 (copy-paste error), making
# it an exact duplicate of fa_raw_5; and n.obs was 854 (presumably the SONA
# sample) instead of the 791 goals in this dataset (matching n.obs at the
# VSS step above) — n.obs affects the chi-square/fit statistics reported later.
fa_raw_4 <- fa(r = corrM_raw, nfactors = 4, n.obs = 791, rotate = "promax",
               SMC = FALSE, fm = "minres")
fa_raw_5 <- fa(r = corrM_raw, nfactors = 5, n.obs = 791, rotate = "promax",
               SMC = FALSE, fm = "minres")
fa_raw_6 <- fa(r = corrM_raw, nfactors = 6, n.obs = 791, rotate = "promax",
               SMC = FALSE, fm = "minres")
fa.diagram(fa_raw_4)
fa.diagram(fa_raw_5)
fa.diagram(fa_raw_6)
Compared to the 5 factors yielded by the SONA study, the factor “measurability” is combined with “attainability”, and the factor “ideal” is new. It is composed of the items “ideal_motivation” (previously in factor “importance”), “control” (previously “measurability”), and “meaningfulness” (previously “importance”).
# Visualize the 5-factor loadings as faceted bar charts
loadings <- fa.sort(fa_raw_5)$loadings
loadings <- as.data.frame(unclass(loadings))
colnames(loadings) <- c("ought", "importance", "attainability", "commonality", "ideal")
loadings$Variables <- rownames(loadings)
loadings.m <- loadings %>% gather(-Variables, key = "Factor", value = "Loading")
# Fix the facet and row orders via factor levels.
# Bug fix: the factor() argument was misspelled "leve" and only worked through
# R's partial argument matching; spell out "levels".
colOrder <- c("ought", "importance", "attainability", "commonality", "ideal")
rowOrder <- rev(rownames(loadings))
loadings.m <- arrange(mutate(loadings.m, Variables = factor(Variables, levels = rowOrder)), Variables)
loadings.m <- arrange(mutate(loadings.m, Factor = factor(Factor, levels = colOrder)), Factor)
ggplot(loadings.m, aes(Variables, abs(Loading), fill = Loading)) +
  facet_wrap(~Factor, nrow = 1) +   # one facet per factor
  geom_bar(stat = "identity") +     # bar height = |loading|
  coord_flip() +                    # horizontal variable names
  # fill gradient: orange = positive loading, midnightblue = negative
  scale_fill_gradient2(name = "Loading",
                       high = "orange", mid = "white", low = "midnightblue",
                       midpoint = 0, guide = "colourbar") +
  ylab("Loading Strength") +
  ggtitle("Loadings for 5 factors") +
  theme_bw(base_size = 10)
The 5 factor loadings from the SONA study:
SONA 5-factor
# Interfactor correlation matrix (Phi) for the 5-factor solution,
# rendered as a styled HTML table.
# Fix: as.tibble() is deprecated since tibble 2.0.0; use as_tibble().
fa_raw_5$Phi %>%
  as_tibble() %>%
  dplyr::rename(ought = MR1, importance = MR2, attainability = MR3,
                commonality = MR4, ideal = MR5) %>%
  round(., 2) %>%
  remove_rownames() %>%
  mutate(factor = colnames(.)) %>%
  select(factor, everything()) %>%
  kable(format = "html", escape = FALSE, caption = "Interfactor Correlation") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE, position = "center", fixed_thead = TRUE)
| factor | ought | importance | attainability | commonality | ideal |
|---|---|---|---|---|---|
| ought | 1.00 | 0.24 | 0.10 | 0.13 | 0.25 |
| importance | 0.24 | 1.00 | 0.35 | 0.37 | 0.25 |
| attainability | 0.10 | 0.35 | 1.00 | 0.04 | 0.23 |
| commonality | 0.13 | 0.37 | 0.04 | 1.00 | -0.06 |
| ideal | 0.25 | 0.25 | 0.23 | -0.06 | 1.00 |
Compared to the 6 factors yielded by the SONA study, the factor “instrumentality” is replaced by the factor “ideal”.
# Visualize the 6-factor loadings as faceted bar charts
loadings <- fa.sort(fa_raw_6)$loadings
loadings <- as.data.frame(unclass(loadings))
colnames(loadings) <- c("ought", "importance", "measurability", "commonality", "ideal", "attainability")
loadings$Variables <- rownames(loadings)
loadings.m <- loadings %>% gather(-Variables, key = "Factor", value = "Loading")
# Fix the facet and row orders via factor levels.
# Bug fix: the factor() argument was misspelled "leve" and only worked through
# R's partial argument matching; spell out "levels".
colOrder <- c("ought", "importance", "measurability", "commonality", "ideal", "attainability")
rowOrder <- rev(rownames(loadings))
loadings.m <- arrange(mutate(loadings.m, Variables = factor(Variables, levels = rowOrder)), Variables)
loadings.m <- arrange(mutate(loadings.m, Factor = factor(Factor, levels = colOrder)), Factor)
ggplot(loadings.m, aes(Variables, abs(Loading), fill = Loading)) +
  facet_wrap(~Factor, nrow = 1) +   # one facet per factor
  geom_bar(stat = "identity") +     # bar height = |loading|
  coord_flip() +                    # horizontal variable names
  # fill gradient: orange = positive loading, midnightblue = negative
  scale_fill_gradient2(name = "Loading",
                       high = "orange", mid = "white", low = "midnightblue",
                       midpoint = 0, guide = "colourbar") +
  ylab("Loading Strength") +
  ggtitle("Loadings for 6 factors") +
  theme_bw(base_size = 10)
The 6 factor loadings from the SONA study:
#### interfactor correlation
# Interfactor correlation matrix (Phi) for the 6-factor solution,
# rendered as a styled HTML table.
# Fix: as.tibble() is deprecated since tibble 2.0.0; use as_tibble().
fa_raw_6$Phi %>%
  as_tibble() %>%
  dplyr::rename(ought = MR1, importance = MR6, measurability = MR3,
                commonality = MR4, ideal = MR5, attainability = MR2) %>%
  round(., 2) %>%
  remove_rownames() %>%
  mutate(factor = colnames(.)) %>%
  select(factor, everything()) %>%
  kable(format = "html", escape = FALSE, caption = "Interfactor Correlation") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE, position = "center", fixed_thead = TRUE)
| factor | ought | importance | measurability | commonality | ideal | attainability |
|---|---|---|---|---|---|---|
| ought | 1.00 | 0.32 | -0.05 | 0.16 | 0.12 | 0.23 |
| importance | 0.32 | 1.00 | 0.30 | 0.37 | 0.22 | 0.29 |
| measurability | -0.05 | 0.30 | 1.00 | 0.18 | 0.20 | 0.20 |
| commonality | 0.16 | 0.37 | 0.18 | 1.00 | -0.07 | -0.04 |
| ideal | 0.12 | 0.22 | 0.20 | -0.07 | 1.00 | 0.06 |
| attainability | 0.23 | 0.29 | 0.20 | -0.04 | 0.06 | 1.00 |
# Collect fit indices for the 5- and 6-factor solutions into one data frame
# for side-by-side comparison
fa_fitDf <- data.frame(
  factors    = c(5, 6),
  chi        = c(fa_raw_5$chi, fa_raw_6$chi),
  BIC        = c(fa_raw_5$BIC, fa_raw_6$BIC),
  fit        = c(fa_raw_5$fit, fa_raw_6$fit),
  RMSEA      = c(fa_raw_5$RMSEA[1], fa_raw_6$RMSEA[1]),
  cumVar     = c(max(fa_raw_5$Vaccounted[3, ]), max(fa_raw_6$Vaccounted[3, ])),
  complexity = c(mean(fa_raw_5$complexity), mean(fa_raw_6$complexity))
)
fa_fitDf
## factors chi BIC fit RMSEA cumVar complexity
## 1 5 484.9504 -680.9335 0.8555492 0.05331112 0.4384451 1.731892
## 2 6 351.5769 -669.4881 0.8679743 0.04970919 0.4579989 1.891997